House Prices

1 Objetivos

  • Número de variáveis: 81
  • Tipo de variáveis
      • Inteiras ou discretas:
      • Numéricas ou double
      • Categóricas
      • Qualitativas
    • Qualidade dos dados
      • Quantidade de NA’s por variável
  • Criação de novas variáveis, se precisar
  • Transformação das variáveis, se precisar

2 Conjunto de Dados de Teste

Iremos realizar nos dados de teste as mesmas transformações aplicadas aos dados de treino.

# Read the raw test set and convert the result of fread() into a plain
# data frame. Character columns are kept as character (no automatic factor
# conversion); TRUE/FALSE are spelled out because T and F can be reassigned.
df.test <- data.table::fread('../dados/test.csv',
                             sep = ",",
                             showProgress = FALSE) %>%
  data.frame(stringsAsFactors = FALSE)
# Display the loaded data.
df.test

3 Separando o conjunto de dados de teste pelo tipo.

3.1 Dados tipo inteiro

# Keep only the columns whose class is exactly "integer".
# vapply() returns one logical per column, so the mask always lines up with
# the columns; unlist(lapply(., class)) becomes longer than ncol() as soon as
# any column carries more than one class. drop = FALSE keeps a data frame
# even when a single column matches.
test.int <- df.test[,
                    vapply(df.test,
                           function(col) identical(class(col), "integer"),
                           logical(1)),
                    drop = FALSE]
# Display the integer subset.
test.int

na amostra test.int também irei imputar dados.

# Percentage of missing values per integer column, rounded to two decimals
# and listed from the highest to the lowest.
# colSums() over the logical NA matrix counts the missing entries of every
# column in one vectorised step.
sort(round(100 * colSums(is.na(test.int)) / nrow(test.int), 2),
     decreasing = TRUE)
##   LotFrontage   GarageYrBlt    MasVnrArea  BsmtFullBath  BsmtHalfBath 
##         15.56          5.35          1.03          0.14          0.14 
##    BsmtFinSF1    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF    GarageCars 
##          0.07          0.07          0.07          0.07          0.07 
##    GarageArea            Id    MSSubClass       LotArea   OverallQual 
##          0.07          0.00          0.00          0.00          0.00 
##   OverallCond     YearBuilt  YearRemodAdd     X1stFlrSF     X2ndFlrSF 
##          0.00          0.00          0.00          0.00          0.00 
##  LowQualFinSF     GrLivArea      FullBath      HalfBath  BedroomAbvGr 
##          0.00          0.00          0.00          0.00          0.00 
##  KitchenAbvGr  TotRmsAbvGrd    Fireplaces    WoodDeckSF   OpenPorchSF 
##          0.00          0.00          0.00          0.00          0.00 
## EnclosedPorch    X3SsnPorch   ScreenPorch      PoolArea       MiscVal 
##          0.00          0.00          0.00          0.00          0.00 
##        MoSold        YrSold 
##          0.00          0.00

4 Selecionando as variáveis

Removendo as mesmas variáveis do conjunto de treinamento.

# Names of the integer columns to keep, read from a previously saved RDS file
# (presumably the selection made on the training set; confirm against the
# training script).
cols.int <- readRDS('../outputs/cols.int.rds')

# Keep Id plus the selected columns, then discard LotFrontage.
test.int <- select(test.int[, c('Id', cols.int)], -LotFrontage)

# Fit caret's k-nearest-neighbour imputation model on every column except the
# first one (Id), then print a summary of the fitted pre-processing steps.
preProcess_missingdata_model <- preProcess(test.int[, -1],
                                           method = 'knnImpute')
preProcess_missingdata_model
## Created from 1365 samples and 27 variables
## 
## Pre-processing:
##   - centered (27)
##   - ignored (0)
##   - 5 nearest neighbor imputation (27)
##   - scaled (27)

Vamos agora usar esse modelo para prever os valores ausentes de test.int.

# Apply the fitted pre-processing model to all columns except the first (Id);
# the result has the missing values filled in.
test.int1 <- predict(object = preProcess_missingdata_model,
                     newdata = test.int[, -1])
# Check whether any missing value is left after imputation.
any(is.na(test.int1))
## [1] FALSE

dados imputados com sucesso!

# Put the Id column back in front of the imputed columns.
test.int <- data.frame(Id = test.int[['Id']], test.int1)

4.1 Transformando dados do tipo string em categóricos

# Select the character columns of the test set and convert each one to a
# factor.
# The conversion is done column by column with lapply(): apply() first turns
# the data frame into a character matrix and returns a character matrix, so
# the factors created by as.factor() were thrown away and the columns only
# became factors again through data.frame()'s stringsAsFactors default, which
# is FALSE from R 4.0.0 onwards.
test.fac <- df.test[, vapply(df.test, is.character, logical(1)), drop = FALSE]
test.fac[] <- lapply(test.fac, as.factor)

removendo as mesmas colunas do conjunto de treino.

# Names of the categorical columns to keep, read from a previously saved RDS
# file (presumably produced while preparing the training set; confirm).
cols.fac <- readRDS('../outputs/cols.fac.rds')

# Drop the five columns that were also removed from the training set ...
test.fac <- select(test.fac,
                   -PoolQC, -MiscFeature, -Alley, -Fence, -FireplaceQu)

# ... and keep only the saved selection, in the saved order.
test.fac <- test.fac[, cols.fac]

verificando a porcentagem de valores nulos

# Percentage of missing values per categorical column, rounded to two
# decimals and listed from the highest to the lowest.
# colSums() over the logical NA matrix counts the missing entries of every
# column in one vectorised step.
sort(round(100 * colSums(is.na(test.fac)) / nrow(test.fac), 2),
     decreasing = TRUE)
##  GarageFinish    GarageCond    GarageType      BsmtCond      BsmtQual 
##          5.35          5.35          5.21          3.08          3.02 
##  BsmtExposure  BsmtFinType1    MasVnrType      MSZoning    Functional 
##          3.02          2.88          1.10          0.27          0.14 
##   Exterior1st   Exterior2nd   KitchenQual      LotShape   LandContour 
##          0.07          0.07          0.07          0.00          0.00 
##     LandSlope  Neighborhood    Condition1      BldgType    HouseStyle 
##          0.00          0.00          0.00          0.00          0.00 
##     RoofStyle      RoofMatl     ExterQual     ExterCond    Foundation 
##          0.00          0.00          0.00          0.00          0.00 
##     HeatingQC    CentralAir    Electrical    PavedDrive SaleCondition 
##          0.00          0.00          0.00          0.00          0.00

Novamente recorremos ao caret para imputar essas categorias.

Para construir um modelo que impute a categoria, vamos retirar todas as variáveis que possuam alguma porcentagem de valores nulos, deixando somente uma delas em cada modelo.

4.2 Criando os data frames

# Categorical columns that contain missing values. Each one gets its own
# modelling frame in which it is the only column with missing values.
na.vars <- c('GarageType', 'GarageFinish', 'GarageCond',
             'BsmtExposure', 'BsmtQual', 'BsmtCond',
             'BsmtFinType1', 'MasVnrType',
             'MSZoning', 'Functional',
             'Exterior1st', 'Exterior2nd',
             'KitchenQual')

# Build the modelling frame for one target column.
#
# Args:
#   fac:     data frame with all categorical columns.
#   target:  name of the column to keep among `na.cols`.
#   na.cols: names of every column that contains missing values.
# Returns:
#   `fac` without the columns listed in `na.cols`, except `target`.
#   Column order is preserved.
f.single.target <- function(fac, target, na.cols = na.vars) {
  # Fail early on a misspelled or absent column instead of silently keeping
  # a predictor that still has missing values.
  stopifnot(length(target) == 1,
            target %in% na.cols,
            all(na.cols %in% names(fac)))
  others <- setdiff(na.cols, target)
  fac[, !(names(fac) %in% others), drop = FALSE]
}

df.GarageType   <- f.single.target(test.fac, 'GarageType')
df.GarageFinish <- f.single.target(test.fac, 'GarageFinish')
df.GarageCond   <- f.single.target(test.fac, 'GarageCond')
df.BsmtExposure <- f.single.target(test.fac, 'BsmtExposure')
df.BsmtQual     <- f.single.target(test.fac, 'BsmtQual')
df.BsmtCond     <- f.single.target(test.fac, 'BsmtCond')
df.BsmtFinType1 <- f.single.target(test.fac, 'BsmtFinType1')
df.MasVnrType   <- f.single.target(test.fac, 'MasVnrType')
df.MSZoning     <- f.single.target(test.fac, 'MSZoning')
df.Functional   <- f.single.target(test.fac, 'Functional')
df.Exterior1st  <- f.single.target(test.fac, 'Exterior1st')
df.Exterior2nd  <- f.single.target(test.fac, 'Exterior2nd')
df.KitchenQual  <- f.single.target(test.fac, 'KitchenQual')

criando um vetor com o nome das variáveis e uma lista com os data-frames criados.

# Names of the categorical variables to impute, one per model.
vars <- c(
  'GarageType',
  'GarageFinish',
  'GarageCond',
  'BsmtExposure',
  'BsmtQual',
  'BsmtCond',
  'BsmtFinType1',
  'MasVnrType',
  'MSZoning',
  'Functional',
  'Exterior1st',
  'Exterior2nd',
  'KitchenQual'
)

# Modelling frames in the same order as `vars`, so that position j in
# `list.df` belongs to the variable named `vars[j]`.
list.df <- list(
  df.GarageType,
  df.GarageFinish,
  df.GarageCond,
  df.BsmtExposure,
  df.BsmtQual,
  df.BsmtCond,
  df.BsmtFinType1,
  df.MasVnrType,
  df.MSZoning,
  df.Functional,
  df.Exterior1st,
  df.Exterior2nd,
  df.KitchenQual
)

A função abaixo automatiza a imputação das categorias nos valores nulos de cada variável.

# Fill the missing values of one categorical column with model predictions.
#
# Args:
#   fac:      data frame whose column `var` is completed.
#   df.var:   data frame containing `var` and the predictor columns; the
#             rows where `var` is NA are the ones sent to the model.
#   rf.model: fitted model accepted by predict(model, newdata = ...).
#   var:      name (single string) of the column to impute.
# Returns:
#   `fac` with the NA entries of `var` replaced by the predictions. `fac` is
#   returned unchanged when `df.var` has no missing value in `var`.
f.pred <- function(fac,df.var,rf.model,var){

  missing.rows <- is.na(df.var[[var]])

  # Nothing to impute: skip the prediction on an empty data frame.
  if (!any(missing.rows)) {
    return(fac)
  }

  # drop = FALSE keeps a data frame even when only one predictor is left.
  new.df <- df.var[missing.rows, !(names(df.var) %in% var), drop = FALSE]
  pred_rf <- predict(rf.model, newdata = new.df)

  # Assign the labels rather than the factor itself: a factor written into a
  # character column would store its integer codes, while a factor column
  # matches the labels against its levels either way.
  fac[is.na(fac[[var]]), var] <- as.character(pred_rf)

  fac
}

4.3 Usando a Caret e RandomForest

Utilizarei o random forest como classificador para imputar as categorias faltantes.

# Resampling setup shared by all imputation models: 3-fold cross-validation,
# keeping the hold-out predictions of the final model, no class
# probabilities, and multi-class summary metrics.
# FALSE is spelled out because the shorthand F can be reassigned.
set.seed(12345)
fitControl <- trainControl(method = "cv",
                           number = 3,
                           savePredictions = 'final',
                           classProbs = FALSE,
                           summaryFunction = multiClassSummary)

Construindo os modelos para imputar os valores nulos.

set.seed(12345)

# One fitted model per variable in `vars`, preallocated to its final length.
rf.list <- vector("list", length(vars))

# seq_along() is safe even if `vars` is empty (1:length(vars) would run
# for j = 1 and j = 0).
for (j in seq_along(vars)) {

  # Training rows: the frame of variable j without the rows that have
  # missing values.
  df <- na.omit(list.df[[j]])

  # Train a random forest with variable j as outcome and every remaining
  # column as predictor. The formula is built with as.formula() instead of
  # eval(parse(text = ...)).
  rf.list[[j]] <- train(as.formula(paste(vars[j], '~ .')),
                        data = df,
                        tuneLength = 5,
                        trControl = fitControl,
                        method = 'rf')

  # Fill the missing categories of variable j in test.fac with the model's
  # predictions.
  test.fac <- f.pred(test.fac, list.df[[j]], rf.list[[j]], vars[j])

  # Progress indicator.
  cat(j, '  ')

}
## 1   2   3   4   5   6   7   8   9   10   11   12   13

Será que ainda há algum valor nulo?

# Check whether any missing value is left in the categorical columns.
any(is.na(test.fac))
## [1] FALSE

Imputação de dados realizada com sucesso !

5 Juntando os data frames

Juntando os dados dos tipos inteiro e categórico.

# Combine the imputed integer columns and the imputed categorical columns
# side by side; bind_cols() matches rows by position.
df.test <- test.int %>%
  bind_cols(test.fac)
# Display the combined data.
df.test

Agora nosso df.test encontra-se limpo e pronto para ser explorado.

6 Exportando os dados limpos

# Save the cleaned test set. With its defaults write.csv() also writes the
# row names as the first column of the file.
write.csv(x = df.test,
          file = '../outputs/df.test.csv')

Sérgio Carvalho

22 março, 2019